import os
import ast
import sys
import shutil
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from pprint import pformat, pprint
from helpers import _build_summary_data, plot_missing_values_per_group, ModelEvaluation
from misc.df_plotter.mv_plotter import MissingValuePlotter
from misc.df_plotter.box_plotter import BoxPlotter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import estimator_html_repr
from IPython.core.display import display, HTML
from IPython.display import Markdown as md
from IPython.display import Image
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, plot_roc_curve, plot_precision_recall_curve
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable offline Plotly rendering inside this notebook.
init_notebook_mode(connected=True)
# REMARK: change parameters of this cell and the script/notebook will run for either longitudinal
# or base evaluation, but markdown text will mostly not be influenced. So markdown cells have
# to be adjusted by hand in every script!
study_type = 'base'  # 'base' or 'long' (longitudinal)
target = 'group'  # target feature as string
target_mapping = {'positive': 1, 'negative': 0}  # numeric encoding of the target classes
working_file = "working_file_base_2021-03-22"  # name of working file (without ending) as string
chosen_model = None  # e.g. 'AdaBoostClassifier' / 'KNeighborsClassifier' / 'LogisticRegression'; if None best model (mean test accuracy) is taken
re_calculate = False  # time consuming calculations will not be done again if re_calculate = False and file exists in folder
no_features = 15  # number of features for SFS (Feature Importance)
test_size = 0.25  # chosen size of analyzed test set
This notebook shows the methodical way of analyzing the base data of COV19 (positive vs. negative): troponin = 'all'
run 'train_test_split_analysis.py' with repeated randomized train-test splits of different split sizes -> evaluate the optimal fraction for splitting into train and test data.
selecting 'optimal' train-test-split-size-fraction and getting best model from all supported model types (LogisticRegression, KNeighborsClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, SVC). Best model = reaching best mean test-accuracy-score (on unseen data)
With this model and its best hyperparameters retrain all data and analyze results with its FeatureImportance
In this section the behaviour between train-data size and test-data size is analyzed. The goal is to find an optimal fraction that reaches stable and valid results. An optimal fraction is reached if the variance in results on train and test data is relatively low. Normally much train data and little test data leads to a low variance in train results but a high variance in test results, and vice versa. Somewhere in between there should be an optimum. This fraction is taken for further analysis!
# Load the working data set: the longitudinal study uses the raw working file,
# the base study uses the adjusted variant.
if study_type == 'long':
    data = pd.read_pickle(f"{working_file}.pkl")
else:
    data = pd.read_pickle(f"{working_file}_adjusted.pkl")
# Missing values per feature, persisted as an interactive HTML figure.
fig = MissingValuePlotter(df=data).get_plot()
fig.write_html(f"MissingValuePlotter.html")
fig.show()
# Missing values per target group (positive vs. negative).
data_summary = _build_summary_data(data, target)
fig = plot_missing_values_per_group(data_summary, target)
fig.write_html(f"mv_per_group_before_transformation.html")
fig.show()
From the train-test-split analysis the following two plots have been generated:
# Load the simulation results of the repeated randomized train-test splits.
try:
    sim_data = pd.read_csv(f"train_test_split_simulation.csv", index_col=0)
except FileNotFoundError as err:
    # Chain the original error so the failing path stays visible; the
    # simulation script has to be run first to produce this file.
    raise Exception("File 'train_test_split_simulation.csv' not found - "
                    "run 'train_test_split_analysis.py' first!") from err
# Reshape to long format so every accuracy metric becomes one observation row.
data_mod = sim_data.melt(id_vars=['algorithm', 'test_size', 'random_state'],
                         value_vars=['mean_cv_accuracy', 'mean_train_accuracy', 'test_accuracy'])
# build plot: one boxplot panel per algorithm, split by metric and test size
sns.set_style("whitegrid")
sns.catplot(data=data_mod, x='test_size', y='value', col='algorithm', hue='variable', kind='box', col_wrap=2)
plt.savefig(f"train_test_analysis.png")
These plots show the estimated behaviour. With low test-sizes (0.1) the variance in test_accuracy is high. This variance is getting lower if the test_size is increased. From visual sight an optimum could be around a test-size of 0.25. At this point variance of train- and test-accuracy seems to be relatively low.
It's also visible that the GradientBoostingClassifier seems to overfit data as seen in the huge gap between mean_train_accuracy and test accuracy.
In a second step the best model of all supported models and hyperparameters is selected. The best model is the one reaching the best mean test-accuracy score (accuracy score on unseen data). After choosing the optimal train-test split size of 0.25, only these results will be considered in the next steps of this analysis!
The output of the former analysis has been two .csv files with all best models (hyperparameters) per algorithm and train-test-split/random_state.
# Display the loaded simulation results (sim_data is already a DataFrame; the
# constructor call just re-wraps it for notebook display).
pd.DataFrame(sim_data)
| algorithm | simulation | best_params | mean_cv_accuracy | cv_std | mean_train_accuracy | train_std | test_accuracy | test_size | random_state | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | 0 | {'estimator__C': 10, 'estimator__penalty': 'l2... | 0.758018 | 0.024041 | 0.778610 | 0.007270 | 0.846154 | 0.10 | 1 |
| 1 | KNeighborsClassifier | 0 | {'estimator__n_neighbors': 3, 'prep__num_pipel... | 0.714890 | 0.014832 | 0.823966 | 0.017201 | 0.730769 | 0.10 | 1 |
| 2 | RandomForestClassifier | 0 | {'estimator__criterion': 'gini', 'estimator__m... | 0.749579 | 0.045452 | 0.755400 | 0.005510 | 0.730769 | 0.10 | 1 |
| 3 | AdaBoostClassifier | 0 | {'estimator__base_estimator': DecisionTreeClas... | 0.764563 | 0.012817 | 0.802913 | 0.008088 | 0.826923 | 0.10 | 1 |
| 4 | BaggingClassifier | 0 | {'estimator__base_estimator': DecisionTreeClas... | 0.777536 | 0.016080 | 0.878491 | 0.017956 | 0.846154 | 0.10 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 625 | RandomForestClassifier | 89 | {'estimator__criterion': 'gini', 'estimator__m... | 0.736590 | 0.063489 | 0.759735 | 0.011106 | 0.745856 | 0.35 | 37 |
| 626 | AdaBoostClassifier | 89 | {'estimator__base_estimator': DecisionTreeClas... | 0.763546 | 0.026760 | 0.877992 | 0.003876 | 0.734807 | 0.35 | 37 |
| 627 | BaggingClassifier | 89 | {'estimator__base_estimator': DecisionTreeClas... | 0.769561 | 0.019213 | 0.818128 | 0.014119 | 0.773481 | 0.35 | 37 |
| 628 | GradientBoostingClassifier | 89 | {'estimator__learning_rate': 0.15, 'estimator_... | 0.796336 | 0.035243 | 0.979784 | 0.006098 | 0.767956 | 0.35 | 37 |
| 629 | SVC | 89 | {'estimator__C': 1, 'estimator__kernel': 'line... | 0.772411 | 0.022373 | 0.793412 | 0.007285 | 0.751381 | 0.35 | 37 |
630 rows × 10 columns
From these tables only test_size=0.25 will be taken for further analysis: The best model overall is the model with the best mean test accuracy (on test_size=0.25). Its best hyperparameters are determined by a majority vote.
size_test_data = 0.25  # optimal train-test fraction chosen from the analysis above


def build_best_param_from_str(params):
    """Parse the most frequent stringified hyperparameter dict back into a dict.

    `params` is the grouped/counted best_params frame whose index holds the
    parameter dicts as strings (e.g. "{'estimator__C': 10, ...}"); the first
    (= most frequent) index entry is parsed.  `ast.literal_eval` cannot be
    used because some values are estimator reprs, so each value goes through
    `eval`.

    NOTE(review): `eval` executes arbitrary code - acceptable here only
    because the strings come from our own simulation output, never from
    untrusted input.  Values containing ',' would still break the naive
    comma splitting.

    :param params: pandas object whose ``index[0]`` is the stringified dict
    :return: dict mapping parameter name -> evaluated value
    """
    best_param = {}
    param_str = params.index[0][1:-1]  # strip surrounding '{' and '}'
    for item in param_str.split(','):
        # Split only on the first ':' so values containing ':' stay intact.
        key, value = item.split(':', 1)
        for ch in ['"', '\\', "'", " "]:
            key = key.replace(ch, '')
        best_param[key] = eval(value)
    return best_param
# Select the best algorithm and its majority-vote hyperparameters for the
# chosen split size.  The loop allows evaluating several simulation frames;
# currently only one is analyzed.
for i, sd in enumerate([sim_data]):
    # optimum between variance in train- and test-score (watch plot)
    data_red = sd[sd['test_size'] == size_test_data]
    # Rank algorithms by mean test accuracy, ties broken by mean CV accuracy.
    best_algorithms = data_red.groupby("algorithm").mean().sort_values(by=['test_accuracy', 'mean_cv_accuracy'], ascending=False)
    if chosen_model is None:
        best_algorithm = best_algorithms.index[0]
        idx = 0
    else:
        best_algorithm = chosen_model
        idx = best_algorithms.index.get_loc(chosen_model)
    # Majority vote: count how often each stringified param dict won.
    best_params = data_red[data_red['algorithm'] == best_algorithm].groupby('best_params').count().sort_values(by="algorithm", ascending=False)
    # ast.literal_eval cannot parse estimator reprs inside the strings, hence
    # the custom parser is used instead of the line below.
    # NOTE(review): `idx` is not used by the active parser (it always takes
    # index[0]) - confirm this is intended when chosen_model is set.
    # best_param = ast.literal_eval(best_params.index[idx])
    best_param = build_best_param_from_str(best_params)
    # performances of best algorithm
    stats = data_red.loc[:, ['mean_cv_accuracy', 'mean_train_accuracy', 'test_accuracy']][
        data_red['algorithm'] == best_algorithm].agg(['mean', 'std'])
    # Persist name, parameters and statistics for the later notebook sections.
    results = {best_algorithm: best_param, 'statistics': stats}
    joblib.dump(results, f"best_model.joblib")
    print(f"{best_algorithm}\n")
    print(f"{best_param}\n")
    print(f"{stats}")
GradientBoostingClassifier
{'estimator__learning_rate': 0.05, 'estimator__max_depth': 2, 'estimator__max_features': 'auto', 'estimator__min_samples_leaf': 0.1, 'estimator__subsample': 0.9, 'prep__num_pipeline__imputer__n_neighbors': 3}
mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.788820 0.902203 0.775711
std 0.013799 0.043169 0.035216
In the former chapters nothing has been said about the model pipelines and their tuning parameters. These will be shown in this section. We will analyze the data and its predictions with the best evaluated model in section 2.
# Reload the working data (same selection logic as in part 1).
if study_type == 'long':
    data = pd.read_pickle(f"{working_file}.pkl")
else:
    data = pd.read_pickle(f"{working_file}_adjusted.pkl")
data.head(5)
| group | alt_ | antibody-synth-lymph_ | aptt_ | ast_ | ck_mb_ | creatinine_ | crp_ | hgb_ | mpv_ | ... | pt_ | reactive-lympho_ | troponin_ | wbc_ | quot_neu_lymphocytes_ | quot_eos_lymphocytes_ | quot_mon_lymphocytes_ | quot_plt_lymphocytes_ | quot_baso_lymphocytes_ | quot_plt_neu_ | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| SC2_Pos1 | positive | 11.0 | 0.00 | 78.5 | 23.0 | NaN | 125.1 | 0.14 | 13.6 | 11.1 | ... | 18.1 | 0.10 | 0.054 | 12.76 | 10.882353 | 0.088235 | 0.470588 | 202.941176 | 0.039216 | 18.648649 |
| SC2_Pos10 | positive | 12.0 | 0.00 | 31.3 | 13.0 | 0.9 | 93.5 | 0.90 | 15.0 | 11.6 | ... | 15.3 | 0.02 | 0.005 | 6.63 | 5.974359 | 0.012821 | 1.448718 | 285.897436 | 0.038462 | 47.854077 |
| SC2_Pos100 | positive | 33.0 | 0.04 | 24.6 | 87.0 | 9.7 | 54.1 | 10.35 | 13.7 | 10.2 | ... | 12.3 | 0.09 | 1.475 | 6.79 | 6.091954 | 0.000000 | 0.678161 | 180.459770 | 0.011494 | 29.622642 |
| SC2_Pos101 | positive | 11.0 | 0.00 | 31.8 | 16.0 | 0.3 | 109.8 | 8.55 | 13.8 | 10.5 | ... | 13.9 | 0.01 | 0.026 | 9.06 | 5.840336 | 0.000000 | 0.739496 | 164.705882 | 0.016807 | 28.201439 |
| SC2_Pos102 | positive | 9.0 | 0.00 | 27.0 | 15.0 | 1.6 | 94.2 | 11.75 | 13.4 | 11.1 | ... | 15.5 | 0.07 | 0.041 | 8.07 | 4.268116 | 0.014493 | 0.528986 | 149.275362 | 0.014493 | 34.974533 |
5 rows × 24 columns
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| alt_ | 384.0 | 53.867188 | 189.970853 | 5.000000 | 14.000000 | 24.500000 | 41.000000 | 2864.000000 |
| antibody-synth-lymph_ | 477.0 | 0.010042 | 0.029270 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.240000 |
| aptt_ | 458.0 | 32.975983 | 13.269349 | 16.600000 | 27.100000 | 30.500000 | 35.075000 | 222.600000 |
| ast_ | 378.0 | 70.383598 | 227.797081 | 7.000000 | 21.000000 | 30.000000 | 55.000000 | 3239.000000 |
| ck_mb_ | 426.0 | 4.141080 | 27.491986 | 0.300000 | 0.700000 | 1.300000 | 2.900000 | 557.400000 |
| creatinine_ | 504.0 | 145.095040 | 190.597133 | 15.300000 | 71.300000 | 90.300000 | 130.925000 | 1806.100000 |
| crp_ | 507.0 | 8.361105 | 9.016606 | 0.050000 | 1.000000 | 5.000000 | 13.480000 | 41.900000 |
| hgb_ | 510.0 | 12.757196 | 2.448105 | 0.070000 | 11.300000 | 13.000000 | 14.500000 | 20.300000 |
| mpv_ | 501.0 | 10.716168 | 1.064649 | 8.500000 | 9.900000 | 10.500000 | 11.400000 | 14.200000 |
| neutrophil-granules_ | 504.0 | 154.725595 | 5.226705 | 132.600000 | 151.500000 | 154.850000 | 158.325000 | 186.500000 |
| neutrophil-reactivity_ | 504.0 | 48.545238 | 5.550740 | 36.900000 | 45.600000 | 47.700000 | 50.225000 | 89.300000 |
| pct_ | 390.0 | 2.599103 | 9.971618 | 0.010000 | 0.040000 | 0.160000 | 0.807500 | 131.530000 |
| pdw_ | 496.0 | 12.667540 | 2.684248 | 8.100000 | 10.800000 | 12.200000 | 13.900000 | 24.500000 |
| pt_ | 459.0 | 16.279739 | 8.495689 | 9.400000 | 12.700000 | 13.900000 | 16.100000 | 85.100000 |
| reactive-lympho_ | 478.0 | 0.081862 | 0.068006 | 0.000000 | 0.040000 | 0.070000 | 0.107500 | 0.590000 |
| troponin_ | 459.0 | 0.218795 | 1.727865 | 0.001000 | 0.006000 | 0.019000 | 0.064500 | 35.670000 |
| wbc_ | 507.0 | 11.889822 | 8.065063 | 0.800000 | 7.095000 | 10.290000 | 14.250000 | 92.420000 |
| quot_neu_lymphocytes_ | 502.0 | 11.348163 | 16.257281 | 0.314607 | 3.482060 | 6.961163 | 12.629808 | 176.000000 |
| quot_eos_lymphocytes_ | 502.0 | 0.181114 | 2.156734 | 0.000000 | 0.000000 | 0.014388 | 0.050817 | 46.685393 |
| quot_mon_lymphocytes_ | 502.0 | 0.881307 | 1.377858 | 0.028571 | 0.366263 | 0.578079 | 0.980297 | 24.666667 |
| quot_plt_lymphocytes_ | 503.0 | 257.031980 | 297.812459 | 11.560694 | 126.288098 | 187.500000 | 279.683818 | 4200.000000 |
| quot_baso_lymphocytes_ | 502.0 | 0.048944 | 0.246116 | 0.000000 | 0.014337 | 0.023483 | 0.042736 | 5.359551 |
| quot_plt_neu_ | 504.0 | 36.555886 | 34.808279 | 2.430134 | 18.217557 | 28.543011 | 45.515152 | 433.333333 |
data.info()
<class 'pandas.core.frame.DataFrame'> Index: 515 entries, SC2_Pos1 to SC2_Neg314 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 group 515 non-null category 1 alt_ 384 non-null float64 2 antibody-synth-lymph_ 477 non-null float64 3 aptt_ 458 non-null float64 4 ast_ 378 non-null float64 5 ck_mb_ 426 non-null float64 6 creatinine_ 504 non-null float64 7 crp_ 507 non-null float64 8 hgb_ 510 non-null float64 9 mpv_ 501 non-null float64 10 neutrophil-granules_ 504 non-null float64 11 neutrophil-reactivity_ 504 non-null float64 12 pct_ 390 non-null float64 13 pdw_ 496 non-null float64 14 pt_ 459 non-null float64 15 reactive-lympho_ 478 non-null float64 16 troponin_ 459 non-null float64 17 wbc_ 507 non-null float64 18 quot_neu_lymphocytes_ 502 non-null float64 19 quot_eos_lymphocytes_ 502 non-null float64 20 quot_mon_lymphocytes_ 502 non-null float64 21 quot_plt_lymphocytes_ 503 non-null float64 22 quot_baso_lymphocytes_ 502 non-null float64 23 quot_plt_neu_ 504 non-null float64 dtypes: category(1), float64(23) memory usage: 97.2+ KB
# Pairplot of all features colored by group (base study only).  Rendering is
# expensive, so the figure is created once and cached as a PNG on disk.
if study_type == 'base':
    if not os.path.isfile('pairplot.png'):
        # sns.pairplot creates its own figure; the former plt.figure() call
        # only produced the empty "<Figure ... with 0 Axes>" seen in output.
        fig = sns.pairplot(data=data, hue='group', plot_kws={'alpha':0.2})
        fig.savefig('pairplot.png')
    else:
        display(Image(filename='pairplot.png'))
<Figure size 1080x576 with 0 Axes>
# Overall missing-value count and its share of all cells in the data set.
mv = data.isna().values.sum()
frac = round(mv / data.size * 100, 3)
md("The data contains <b>{}</b> missing values in overall, that is a fraction of <b>{}</b>%.".format(mv, frac))
The data contains 888 missing values in overall, that is a fraction of 7.184%.
The missing-value plot is already shown in part 1, as well as the missing data per subgroup.
# Boxplots per feature, split by target group (base study only).
if study_type == 'base':
    bp = BoxPlotter(df=data, features=None, split=target)
    bp.update_layout(n_cols=3, n_rows=3, style_grid={'vertical_spacing': 0.1, 'horizontal_spacing': 0.1},
                     colors={'positive': 'green', 'negative': 'red'}, style_figure={'boxpoints': 'all'})
    figs = bp.get_plot()
    for idx, item in enumerate(figs):
        item.update_layout(title_text=f"Boxplot {idx}", width=1600, height=800)
        item.show()
    bp.store(path=f'{os.getcwd()}', name=f"BoxPlotter")
# Feature-feature correlation heatmap.
# NOTE(review): data.corr() silently skips the non-numeric target column on
# older pandas; newer versions need numeric_only=True - verify pinned version.
plt.figure(figsize=(8,6))
sns.heatmap(data=data.corr(), cmap='coolwarm')
plt.show()
Next, the target variable has to be replaced by numeric values to calculate the correlation of the features to the target.
# Encode the categorical target numerically so it enters the correlation.
data_mod = data.copy()
data_mod[target] = data_mod[target].map(target_mapping).astype('float')
data_mod.corr()[target].sort_values()
quot_neu_lymphocytes_ -0.280672 wbc_ -0.252697 quot_mon_lymphocytes_ -0.197556 pct_ -0.187735 pt_ -0.167718 quot_plt_lymphocytes_ -0.142922 neutrophil-granules_ -0.132211 crp_ -0.129023 ast_ -0.098254 alt_ -0.093577 quot_baso_lymphocytes_ -0.091726 neutrophil-reactivity_ -0.062803 creatinine_ -0.062582 ck_mb_ -0.057095 quot_eos_lymphocytes_ -0.048955 troponin_ -0.048017 aptt_ -0.003143 pdw_ 0.049451 mpv_ 0.058571 hgb_ 0.070693 reactive-lympho_ 0.082932 antibody-synth-lymph_ 0.274321 quot_plt_neu_ 0.292911 group 1.000000 Name: group, dtype: float64
The pipeline for the ML workflow looks as follows:
# Load the persisted pipeline skeleton and the hyperparameter search grid.
pipe = joblib.load(f'pipe.joblib')
params = joblib.load(f'params.joblib')
print(pformat(params))
([{'prep__num_pipeline__imputer__n_neighbors': [1, 3, 5]}],
[{'estimator': [LogisticRegression()],
'estimator__C': [0.1, 1, 10],
'estimator__penalty': ['l2']},
{'estimator': [KNeighborsClassifier()],
'estimator__n_neighbors': [3, 5, 7, 9]},
{'estimator': [RandomForestClassifier()],
'estimator__criterion': ['gini', 'entropy'],
'estimator__max_depth': [1],
'estimator__min_samples_leaf': [0.005, 0.01, 0.05, 0.1],
'estimator__min_samples_split': [0.005, 0.01, 0.05, 0.1]},
{'estimator': [AdaBoostClassifier()],
'estimator__base_estimator': [DecisionTreeClassifier(max_depth=1)],
'estimator__learning_rate': [0.1, 0.25, 0.5, 0.75, 1.0]},
{'estimator': [BaggingClassifier()],
'estimator__base_estimator': [DecisionTreeClassifier(max_depth=1),
DecisionTreeClassifier(max_depth=2),
DecisionTreeClassifier(max_depth=3),
DecisionTreeClassifier(max_depth=4)],
'estimator__max_features': [0.2, 0.4, 0.6, 0.8, 1.0]},
{'estimator': [GradientBoostingClassifier()],
'estimator__learning_rate': [0.15, 0.1, 0.05, 0.01],
'estimator__max_depth': [1, 2],
'estimator__max_features': ['auto', 'sqrt', 'log2'],
'estimator__min_samples_leaf': [0.005, 0.01, 0.05, 0.1],
'estimator__subsample': [0.8, 0.9, 1]},
{'estimator': [SVC()],
'estimator__C': [0.1, 0.5, 1, 5, 10],
'estimator__kernel': ['linear', 'rbf', 'poly']}])
The best hyperparameters as well as the performance statistics have been evaluated in the section above. These are:
# Reload the persisted best-model description and report its contents.
results = joblib.load(f"best_model.joblib")
keys = list(results)
best_algorithm_name = keys[0]
print(f"Best algorithm: {best_algorithm_name}\n")
print(f"Best parameters: {results[keys[0]]}\n")
print(f"statistics: {results[keys[1]]}")
Best algorithm: GradientBoostingClassifier
Best parameters: {'estimator__learning_rate': 0.05, 'estimator__max_depth': 2, 'estimator__max_features': 'auto', 'estimator__min_samples_leaf': 0.1, 'estimator__subsample': 0.9, 'prep__num_pipeline__imputer__n_neighbors': 3}
statistics: mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.788820 0.902203 0.775711
std 0.013799 0.043169 0.035216
The model will be built from the pipeline by replacing the dummy estimator with the best estimator and updating the hyperparameters in the pipe. After this, the pipe can be fitted on the overall data!
# Rebuild the full pipeline with the best estimator and its best
# hyperparameters, then refit on the complete data set.
X = data.drop([target], axis=1)
y = data[target]
y = y.map(target_mapping)  # for precision_scorer labels have to be binary with 0,1
updated_params = results.get(keys[0])
# GridSearchCV expects each parameter as a list of candidate values.
updated_params = {key: [value] for key, value in updated_params.items()}
# NOTE(review): this aliases (does not copy) the loaded pipe, so 'pipe' is
# mutated too - harmless here because 'pipe' is not reused afterwards.
updated_pipe = pipe
updated_pipe.steps.pop(-1)
# Look the estimator class up by name in module scope instead of eval()-ing
# a dynamically built code string.
updated_pipe.steps.append(('estimator', globals()[keys[0]]()))
kfold = KFold(n_splits=5, shuffle=True)
model = GridSearchCV(estimator=updated_pipe, param_grid=updated_params, cv=kfold, n_jobs=-1,
                     return_train_score=True, scoring='accuracy', refit=True)
model.fit(X, y)
display(HTML(estimator_html_repr(model.best_estimator_)))
Pipeline(steps=[('prep',
ColumnTransformer(n_jobs=-1,
transformers=[('num_pipeline',
Pipeline(steps=[('imputer',
KNNImputer(n_neighbors=3)),
('log_trans',
FunctionTransformer(func=)),
('min_max',
MinMaxScaler())]),
),
('cat_pipeline',
Pipeline(steps=[('oe',
OrdinalEncoder())]),
)])),
('estimator',
GradientBoostingClassifier(learning_rate=0.05, max_depth=2,
max_features='auto',
min_samples_leaf=0.1,
subsample=0.9))]) ColumnTransformer(n_jobs=-1,
transformers=[('num_pipeline',
Pipeline(steps=[('imputer',
KNNImputer(n_neighbors=3)),
('log_trans',
FunctionTransformer(func=)),
('min_max', MinMaxScaler())]),
),
('cat_pipeline',
Pipeline(steps=[('oe', OrdinalEncoder())]),
)]) KNNImputer(n_neighbors=3)
FunctionTransformer(func=)
MinMaxScaler()
OrdinalEncoder()
GradientBoostingClassifier(learning_rate=0.05, max_depth=2, max_features='auto',
min_samples_leaf=0.1, subsample=0.9)model.cv_results_
{'mean_fit_time': array([0.22806678]),
'std_fit_time': array([0.02872978]),
'mean_score_time': array([0.0213594]),
'std_score_time': array([0.00472682]),
'param_estimator__learning_rate': masked_array(data=[0.05],
mask=[False],
fill_value='?',
dtype=object),
'param_estimator__max_depth': masked_array(data=[2],
mask=[False],
fill_value='?',
dtype=object),
'param_estimator__max_features': masked_array(data=['auto'],
mask=[False],
fill_value='?',
dtype=object),
'param_estimator__min_samples_leaf': masked_array(data=[0.1],
mask=[False],
fill_value='?',
dtype=object),
'param_estimator__subsample': masked_array(data=[0.9],
mask=[False],
fill_value='?',
dtype=object),
'param_prep__num_pipeline__imputer__n_neighbors': masked_array(data=[3],
mask=[False],
fill_value='?',
dtype=object),
'params': [{'estimator__learning_rate': 0.05,
'estimator__max_depth': 2,
'estimator__max_features': 'auto',
'estimator__min_samples_leaf': 0.1,
'estimator__subsample': 0.9,
'prep__num_pipeline__imputer__n_neighbors': 3}],
'split0_test_score': array([0.81553398]),
'split1_test_score': array([0.73786408]),
'split2_test_score': array([0.7961165]),
'split3_test_score': array([0.77669903]),
'split4_test_score': array([0.82524272]),
'mean_test_score': array([0.79029126]),
'std_test_score': array([0.03106796]),
'rank_test_score': array([1], dtype=int32),
'split0_train_score': array([0.84466019]),
'split1_train_score': array([0.86407767]),
'split2_train_score': array([0.85679612]),
'split3_train_score': array([0.87378641]),
'split4_train_score': array([0.84466019]),
'mean_train_score': array([0.85679612]),
'std_train_score': array([0.01128053])}
# Report the best cross-validated accuracy of the refitted grid search.
md('Best model cross-validated score is <b>{}</b>'.format(model.best_score_))
Best model cross-validated score is 0.7902912621359224
The performance in prediction (example) can be slightly optimistic because the model was refit on all data (no train/test split). But the statistics above show the 'true' (estimated) statistic values.
# Predict on the full data set once and reuse the result; the original code
# called model.predict(X) a second time just to show the first ten values.
y_pred = model.predict(X)
y_pred[0:10]
array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1])
# Class probabilities exist only for estimators implementing predict_proba
# (e.g. SVC without probability=True does not).
try:
    model.predict_proba(X)[0:10]
except AttributeError:
    # Catch only the expected failure instead of a silent bare except.
    pass
# Confusion matrix of the refitted model on the full data set.
# Fixed: plt.Figure(...) only builds a detached Figure object and has no
# effect on the pyplot state machine; plt.figure(...) is the intended call.
plt.figure(figsize=(10,6))
plot_confusion_matrix(estimator=model, X=X, y_true=y)
plt.grid()
plt.savefig(f'confusion_matrix.png')
# Misclassification count; reuses the predictions computed above instead of
# running model.predict(X) yet another time.
correct_classified = sum(y_pred == y)
all_classified = len(y)
missclassified = all_classified - correct_classified
md('There are <b>{}</b> missclassifications on a total of <b>{}</b> observations'.format(missclassified, all_classified))
There are 83 missclassifications on a total of 515 observations
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.83 0.92 0.87 314
1 0.86 0.71 0.77 201
accuracy 0.84 515
macro avg 0.84 0.82 0.82 515
weighted avg 0.84 0.84 0.84 515
The confusion matrix as well as the classification report show a well-balanced picture. Precision, recall and f1-score are very close, so the model seems to work quite accurately!
This section tries to highlight the most important features in the dataset. Two approaches are chosen:
# --- Feature importance, approach 1: model coefficients ---
# Locate and detach the estimator step from the fitted best pipeline, leaving
# only the preprocessing steps in place.
position_estimator = [idx for idx, i in enumerate(list(model.best_estimator_.named_steps)) if i == 'estimator'][0]
best_estimator = model.best_estimator_.steps.pop(position_estimator)[1]
# Transform X with the remaining (preprocessing-only) pipeline.
preprocessed_x = model.best_estimator_.fit_transform(X)
# Column labels for the transformed matrix.
# NOTE(review): if the transformer returns a plain array, the column order of
# X is assumed to survive the preprocessing unchanged - verify.
if isinstance(preprocessed_x, pd.DataFrame):
    cols = preprocessed_x.columns
else:
    cols = X.columns
if best_algorithm_name == 'LogisticRegression':
    # Coefficient-based importance only exists for the linear model.
    coefs = pd.Series(index=cols, data=best_estimator.coef_[0])
    coefs = coefs.sort_values(ascending=False)
    coefs_abs = abs(coefs).sort_values(ascending=False)
    print(coefs)
    # build plot: absolute coefficient magnitude per feature
    plt.figure(figsize=(10,4), dpi=100)
    ax = sns.barplot(x=coefs_abs.index, y=coefs_abs.values)
    for item in ax.get_xticklabels():
        item.set_rotation(75)
    fig = ax.get_figure()
    fig.savefig(f'feature_importance.png')
    md("The most important features seems to be <b>{}</b>".format(coefs[0:7].index.to_list()))
A second approach for the feature importance is the sequential feature selection (SFS).
SFS does not work with a pipeline object; therefore the estimator is removed, the data is preprocessed with the pipe without the estimator, and this bare estimator is given to SFS.
# --- Feature importance, approach 2: sequential forward selection ---
# SFS is expensive, so cached results are reused unless re-calculation is
# forced via the re_calculate flag.
if re_calculate is False and os.path.isfile('feature_importance.joblib'):
    feature_importance = joblib.load('feature_importance.joblib')
    sfs = joblib.load('sfs.joblib')
else:
    # Forward selection on the preprocessed matrix with the bare estimator
    # (SFS cannot handle the full pipeline object, see markdown above).
    sfs = SFS(best_estimator,
              k_features=no_features,
              forward=True,
              scoring='accuracy',
              cv=5,
              n_jobs=-1)
    sfs = sfs.fit(preprocessed_x, y, custom_feature_names=cols.to_list())
    feature_importance = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
    joblib.dump(feature_importance, 'feature_importance.joblib')
    joblib.dump(sfs, 'sfs.joblib')
feature_importance
| feature_idx | cv_scores | avg_score | feature_names | ci_bound | std_dev | std_err | |
|---|---|---|---|---|---|---|---|
| 1 | (16,) | [0.7087378640776699, 0.7184466019417476, 0.660... | 0.702913 | (wbc_,) | 0.0301558 | 0.0234622 | 0.0117311 |
| 2 | (1, 16) | [0.7475728155339806, 0.6990291262135923, 0.757... | 0.765049 | (antibody-synth-lymph_, wbc_) | 0.057672 | 0.0448708 | 0.0224354 |
| 3 | (1, 16, 18) | [0.7572815533980582, 0.7378640776699029, 0.766... | 0.774757 | (antibody-synth-lymph_, wbc_, quot_eos_lymphoc... | 0.0420585 | 0.0327229 | 0.0163615 |
| 4 | (1, 6, 16, 18) | [0.7572815533980582, 0.7184466019417476, 0.766... | 0.770874 | (antibody-synth-lymph_, crp_, wbc_, quot_eos_l... | 0.0483936 | 0.0376519 | 0.0188259 |
| 5 | (1, 6, 8, 16, 18) | [0.7572815533980582, 0.7281553398058253, 0.757... | 0.76699 | (antibody-synth-lymph_, crp_, mpv_, wbc_, quot... | 0.0410087 | 0.0319062 | 0.0159531 |
| 6 | (1, 2, 6, 8, 16, 18) | [0.7669902912621359, 0.7378640776699029, 0.757... | 0.759223 | (antibody-synth-lymph_, aptt_, crp_, mpv_, wbc... | 0.0214689 | 0.0167035 | 0.00835177 |
| 7 | (1, 2, 6, 8, 16, 18, 21) | [0.7572815533980582, 0.7572815533980582, 0.776... | 0.763107 | (antibody-synth-lymph_, aptt_, crp_, mpv_, wbc... | 0.0149743 | 0.0116505 | 0.00582524 |
| 8 | (1, 2, 5, 6, 8, 16, 18, 21) | [0.7669902912621359, 0.7669902912621359, 0.776... | 0.770874 | (antibody-synth-lymph_, aptt_, creatinine_, cr... | 0.00611322 | 0.00475629 | 0.00237815 |
| 9 | (1, 2, 5, 6, 8, 13, 16, 18, 21) | [0.7572815533980582, 0.7475728155339806, 0.766... | 0.770874 | (antibody-synth-lymph_, aptt_, creatinine_, cr... | 0.0231443 | 0.018007 | 0.00900351 |
| 10 | (1, 2, 5, 6, 7, 8, 13, 16, 18, 21) | [0.7766990291262136, 0.7475728155339806, 0.766... | 0.768932 | (antibody-synth-lymph_, aptt_, creatinine_, cr... | 0.0165547 | 0.0128801 | 0.00644005 |
| 11 | (1, 2, 5, 6, 7, 8, 13, 16, 18, 20, 21) | [0.7572815533980582, 0.7572815533980582, 0.757... | 0.768932 | (antibody-synth-lymph_, aptt_, creatinine_, cr... | 0.0183397 | 0.0142689 | 0.00713444 |
| 12 | (1, 2, 4, 5, 6, 7, 8, 13, 16, 18, 20, 21) | [0.7669902912621359, 0.7475728155339806, 0.766... | 0.76699 | (antibody-synth-lymph_, aptt_, ck_mb_, creatin... | 0.0208806 | 0.0162458 | 0.00812291 |
| 13 | (1, 2, 4, 5, 6, 7, 8, 9, 13, 16, 18, 20, 21) | [0.7572815533980582, 0.7864077669902912, 0.766... | 0.770874 | (antibody-synth-lymph_, aptt_, ck_mb_, creatin... | 0.0127257 | 0.00990101 | 0.0049505 |
| 14 | (0, 1, 2, 4, 5, 6, 7, 8, 9, 13, 16, 18, 20, 21) | [0.7475728155339806, 0.7766990291262136, 0.786... | 0.768932 | (alt_, antibody-synth-lymph_, aptt_, ck_mb_, c... | 0.0183397 | 0.0142689 | 0.00713444 |
| 15 | (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 16, 18, 20... | [0.7669902912621359, 0.7864077669902912, 0.776... | 0.770874 | (alt_, antibody-synth-lymph_, aptt_, ck_mb_, c... | 0.0169267 | 0.0131696 | 0.00658479 |
def transform_feature_importance_to_list(feature_importance):
    """Flatten the cumulative SFS feature tuples into an ordered feature list.

    Every row of ``feature_importance['feature_names']`` holds the cumulative
    tuple of features selected up to that SFS step; the feature added in a
    step is the first entry of the tuple not seen in any previous step.  The
    returned list is therefore ordered from most to least important.

    Simplified: the former idx == 0 special case was redundant (the generic
    branch handles the first row identically), and a seen-set replaces the
    O(n^2) list membership test.

    :param feature_importance: DataFrame with a 'feature_names' tuple column
    :return: list of feature names in order of selection
    """
    feature_list = []
    seen = set()  # O(1) membership instead of scanning feature_list
    for features in feature_importance['feature_names']:
        new = [f for f in features if f not in seen]
        feature_list.append(new[0])
        seen.add(new[0])
    return feature_list
# Ordered feature list, most important first (order of addition by SFS).
feature_importance_list = transform_feature_importance_to_list(feature_importance)
md("The most important features seems to be <br><br> <b>{}</b> <br><br> -> ordered by importance from important to less important!".format(feature_importance_list))
The most important features seems to be
['wbc_', 'antibody-synth-lymph_', 'quot_eos_lymphocytes_', 'crp_', 'mpv_', 'aptt_', 'quot_baso_lymphocytes_', 'creatinine_', 'pt_', 'hgb_', 'quot_plt_lymphocytes_', 'ck_mb_', 'neutrophil-granules_', 'alt_', 'neutrophil-reactivity_']
-> ordered by importance from important to less important!
# Visualize the SFS accuracy trajectory with its standard-deviation band.
# Fixed: plt.Figure(...) creates a detached Figure object and has no effect
# on pyplot; plt.figure(...) is the intended call.
plt.figure(figsize=(10,6))
plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.ylim([0.6,1.0])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.savefig('sfs.png')
no_features = 10  # reduced SFS feature count for the per-model evaluation below
# Compare all supported model types on the chosen split: for every algorithm
# report the best hyperparameters, performance statistics and top features.
if study_type == 'long':
    wf = f"{working_file}"
else:
    wf = f"{working_file}_adjusted"
evaluation = ModelEvaluation(working_file = f"{wf}.pkl",
                             target=target,
                             simulation_file='train_test_split_simulation.csv',
                             pipe='pipe.joblib')
# Expensive step - reuse cached results unless re_calculate is requested.
if re_calculate is False and os.path.isfile('overview_models.joblib'):
    overview_models = joblib.load('overview_models.joblib')
else:
    overview_models = evaluation.analyze_models(test_size=test_size, no_features=no_features)
    joblib.dump(overview_models, 'overview_models.joblib')
pprint(overview_models)
[{'1_name': 'LogisticRegression',
'2_best_param': {'estimator__C': 1,
'estimator__penalty': 'l2',
'prep__num_pipeline__imputer__n_neighbors': 5},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.757054 0.782816 0.767959
std 0.017027 0.013339 0.041381,
'4_features': ['wbc_',
'antibody-synth-lymph_',
'quot_plt_neu_',
'pct_',
'crp_',
'troponin_',
'alt_',
'creatinine_',
'quot_baso_lymphocytes_',
'aptt_']},
{'1_name': 'KNeighborsClassifier',
'2_best_param': {'estimator__n_neighbors': 9,
'prep__num_pipeline__imputer__n_neighbors': 3},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.719727 0.771588 0.717313
std 0.015815 0.016130 0.047684,
'4_features': ['quot_plt_neu_',
'neutrophil-reactivity_',
'wbc_',
'antibody-synth-lymph_',
'quot_eos_lymphocytes_',
'quot_baso_lymphocytes_',
'troponin_',
'pct_',
'quot_mon_lymphocytes_',
'quot_plt_lymphocytes_']},
{'1_name': 'RandomForestClassifier',
'2_best_param': {'estimator__criterion': 'entropy',
'estimator__max_depth': 1,
'estimator__min_samples_leaf': 0.01,
'estimator__min_samples_split': 0.01,
'prep__num_pipeline__imputer__n_neighbors': 5},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.744045 0.755313 0.759173
std 0.011894 0.012557 0.038366,
'4_features': ['quot_plt_neu_',
'quot_baso_lymphocytes_',
'wbc_',
'ast_',
'creatinine_',
'quot_mon_lymphocytes_',
'pdw_',
'antibody-synth-lymph_',
'mpv_',
'crp_']},
{'1_name': 'GradientBoostingClassifier',
'2_best_param': {'estimator__learning_rate': 0.05,
'estimator__max_depth': 2,
'estimator__max_features': 'auto',
'estimator__min_samples_leaf': 0.1,
'estimator__subsample': 0.9,
'prep__num_pipeline__imputer__n_neighbors': 3},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.788820 0.902203 0.775711
std 0.013799 0.043169 0.035216,
'4_features': ['wbc_',
'antibody-synth-lymph_',
'quot_eos_lymphocytes_',
'mpv_',
'hgb_',
'quot_baso_lymphocytes_',
'aptt_',
'reactive-lympho_',
'neutrophil-reactivity_',
'neutrophil-granules_']},
{'1_name': 'SVC',
'2_best_param': {'estimator__C': 1,
'estimator__kernel': 'rbf',
'prep__num_pipeline__imputer__n_neighbors': 1},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.765170 0.800304 0.763307
std 0.015411 0.020460 0.037735,
'4_features': ['wbc_',
'antibody-synth-lymph_',
'pct_',
'quot_baso_lymphocytes_',
'reactive-lympho_',
'quot_eos_lymphocytes_',
'alt_',
'quot_plt_neu_',
'creatinine_',
'quot_plt_lymphocytes_']},
{'1_name': 'AdaBoostClassifier',
'2_best_param': {'estimator__base_estimator': DecisionTreeClassifier(max_depth=1),
'estimator__learning_rate': 0.1,
'prep__num_pipeline__imputer__n_neighbors': 1},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.761871 0.839598 0.766408
std 0.012989 0.032978 0.029133,
'4_features': ['quot_neu_lymphocytes_',
'antibody-synth-lymph_',
'neutrophil-reactivity_',
'pct_',
'wbc_',
'pt_',
'quot_mon_lymphocytes_',
'creatinine_',
'crp_',
'pdw_']},
{'1_name': 'BaggingClassifier',
'2_best_param': {'estimator__base_estimator': DecisionTreeClassifier(max_depth=3),
'estimator__max_features': 0.6,
'prep__num_pipeline__imputer__n_neighbors': 5},
'3_stats': mean_cv_accuracy mean_train_accuracy test_accuracy
mean 0.764289 0.850737 0.775711
std 0.013732 0.035051 0.035580,
'4_features': ['wbc_',
'antibody-synth-lymph_',
'quot_neu_lymphocytes_',
'crp_',
'pct_',
'quot_baso_lymphocytes_',
'quot_mon_lymphocytes_',
'quot_plt_neu_',
'neutrophil-granules_',
'mpv_']}]
%%javascript
IPython.notebook.save_notebook()
# Store a static HTML export of this notebook next to it.
# NOTE(review): os.system runs a fixed command string - no untrusted input.
os.system(f'jupyter nbconvert --to html final_evaluation.ipynb')
0